# Frequency (in training steps) at which trained models are saved.
save_step: 10000
# Frequency (in training steps) at which output is fetched and printed.
print_step: 100
# Path of a checkpoint from which to resume a previous training run.
init_from_checkpoint: ""
# Path of a pretrained model, used to better solve the current task.
init_from_pretrain_model: ""
# Path of the trained parameters to load when making predictions.
init_from_params: "./trained_models/step_final/"
# Directory in which trained models are saved.
save_model: "trained_models"
# Directory in which the inference model is saved.
inference_model_dir: "infer_model"
# Random seed for CE or debugging. `null` is YAML's null value; the
# Python-style `None` would be loaded as the string "None".
random_seed: null
# Path to the data files.
data: "./gen_data/wikitext-103/"
# Name of the dataset.
dataset: "wt103"

# Whether to run on GPU (CUDA).
use_gpu: true

# Arguments for the data reader; see reader.py for details.
# `null` is YAML's null value; the Python-style `None` would be loaded as the
# string "None".
token_delimiter: null
batch_size: 32
eval_batch_size: 10

# Hyperparameters for training:
# Number of training epochs.
epoch: 200
# Maximum number of training steps.
max_step: 200000

# Hyperparameters for the optimizer.
# Type of optimizer.
optim: adam
# Learning rate schedule.
scheduler: cosine
# This static learning_rate is applied to the learning rate derived by the
# LearningRateScheduler to get the final learning rate.
learning_rate: 0.00025
# Hyperparameters for the Adam optimizer.
beta1: 0.9
beta2: 0.997
# Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) read it
# as a float; a bare `1e-9` is loaded as a string by those loaders.
eps: 1.0e-9
# Hyperparameter for the Momentum optimizer.
mom: 0.0
# Global gradient clip.
clip: 0.25
# Parameter for learning rate scheduling.
warmup_steps: 0
# Parameter for CosineAnnealingDecay: the minimum learning rate.
eta_min: 0.0
# Parameters for ReduceLROnPlateau.
# Ratio by which the learning rate is reduced.
decay_rate: 0.5
# When the loss does not improve for this number of epochs, the learning rate
# is reduced.
patience: 0
# Lower bound of the learning rate after reduction.
min_lr: 0.0

# Hyperparameters for the model:
# Whether to use adaptive softmax.
adaptive: true
# Size of the dictionary. This can be obtained automatically.
ntokens: 10000
# Dimension of the word embeddings, which is also the last dimension of the
# input and output of multi-head attention, the position-wise feed-forward
# networks, the encoder and the decoder.
d_model: 410
# Dimension of each attention head.
d_head: 41
# Size of the hidden layer in the position-wise feed-forward networks.
d_inner_hid: 2100
# Number of heads used in multi-head attention.
n_head: 10
# Number of sub-layers stacked in the encoder and decoder.
n_layer: 16
# Dropout rate.
dropout: 0.1
# Attention dropout rate.
attn_dropout: 0.0
# Attention type for the decoder:
#   0 for relative partial MHA (as in Transformer-XL),
#   1 for relative MHA (as in Shaw et al.).
attn_type: 0
# Apply layer normalization before (true) or after (false) the sublayers.
normalize_before: false
# Whether to tie weights.
tie_weight: true
# Length of the extended context.
ext_len: 0
# Divisor value for softmax and adaptive input.
div_val: 1
# Target length: the number of tokens to predict.
tgt_len: 150
# Memory length: the length of the retained previous hidden states.
mem_len: 150
# Target length for evaluation: the number of tokens to predict during
# evaluation.
eval_tgt_len: 150
# Whether to use the same attention length for all tokens.
same_length: false
# Use the same positional encoding after clamp_len.
clamp_len: -1
# Number of samples in sampled softmax. -1 means sampled softmax is not used.
sample_softmax: -1
# Evaluation mode: "valid", "test", or "all" (both).
mode: "all"
# Maximum number of evaluation steps.
max_eval_steps: -1
